title: "Statistics Assignment"
author: "Himal Baskota"
date: "2025/05/11"
# --- Environment setup -------------------------------------------------------
# Use the PNG device for non-interactive plotting and pin a CRAN mirror so
# install.packages() can run in batch mode.
Sys.setenv("R_INTERACTIVE_DEVICE" = "png")
options(repos = c(CRAN = "https://cran.r-project.org"))
# Install missing dependencies first, then load them explicitly.
# FIX: the original `if (!require(pkg)) install.packages(pkg)` never loaded
# the package after a fresh install; requireNamespace() probes availability
# and library() fails loudly if loading still does not succeed.
if (!requireNamespace("rsample", quietly = TRUE)) install.packages("rsample")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library(rsample)  # initial_split()/training()/testing() for data splitting
library(ggplot2)  # plotting support (base graphics are used below)
# --- Data import -------------------------------------------------------------
# BUG FIX: the original paths used backslash escapes ("\'" and "\ ") inside
# double-quoted strings; "\ " (backslash-space) is an unrecognized escape
# sequence in R and the file fails to parse. Spaces and apostrophes need no
# escaping inside quotes.
data_dir <- "/Volumes/Mac's Drive /Himal/Softwarica/stat/r-solution"
# Input signals (four columns, no header row in the CSV).
X <- as.matrix(read.csv(file = file.path(data_dir, "x.csv"), header = FALSE))
colnames(X) <- c("x1", "x3", "x4", "x5")
head(X)
## x1 x3 x4 x5
## [1,] 8.34 40.77 1010.84 90.01
## [2,] 23.64 58.49 1011.40 74.20
## [3,] 29.74 56.90 1007.15 41.91
## [4,] 19.07 49.69 1007.22 76.79
## [5,] 11.80 40.66 1017.13 97.20
## [6,] 13.97 39.16 1016.05 84.60
# Output (target) signal y.
Y <- as.matrix(read.csv(file = file.path(data_dir, "y.csv"), header = FALSE))
colnames(Y) <- c("y")
head(Y)
## y
## [1,] 480.48
## [2,] 445.75
## [3,] 438.76
## [4,] 453.09
## [5,] 464.43
## [6,] 470.96
# Time index (1..N), one value per observation.
time <- as.matrix(read.csv(file.path(data_dir, "t.csv"), header = FALSE))
head(time)
## V1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 6
#task 1.1: time-series plots of the input and output signals
# BUG FIX: ts() expects a single start time (or c(period, sub-period));
# the original start = c(min(time), max(time)) misused the maximum time as a
# sub-period offset. With frequency = 1, start = min(time) is correct.
X.ts <- ts(X, start = min(time), frequency = 1)
Y.ts <- ts(Y, start = min(time), frequency = 1)
# First 50 samples of each input. R is 1-indexed, so the original 0:50
# silently behaved like 1:50; use 1:50 explicitly. (Matrix-indexing a ts
# drops the ts class, so these plot against sample index.)
# NOTE(review): the sample values suggest x4 (~1010) is pressure in millibar
# and x5 (~40-100) is humidity — the x3/x4/x5 titles below may be swapped;
# confirm against the data dictionary.
plot(X.ts[1:50, 1], main = "Time series plot of Ambient Temperature", xlab = "Time", ylab = "°C", col = "#1130ee", type = "l")
plot(X.ts[1:50, 2], main = "Time series plot of Atmospheric Pressure", xlab = "Time", ylab = "millibar", col = "#1130ee", type = "l")
plot(X.ts[1:50, 3], main = "Time series plot of Humidity Level", xlab = "Time", ylab = "%", col = "#1130ee", type = "l")
plot(X.ts[1:50, 4], main = "Time series plot of Exhaust Vaccum", xlab = "Time", ylab = "cm Hg", col = "#1130ee", type = "l")
# NOTE(review): the output is shown for samples 400:450 while inputs use
# 1:50 — confirm this range difference is intentional.
plot(Y.ts[400:450], main = "Time series plot of Energy Output", xlab = "Time", ylab = "MegaWatt", col = "#1130ee", type = "l")
# Overlaid density curves for all four input signals on one axis system.
density_x1 <- density(X[, "x1"])
density_x3 <- density(X[, "x3"])
density_x4 <- density(X[, "x4"])
density_x5 <- density(X[, "x5"])
# Common y-limit so every curve fits inside the plot.
max_y <- max(density_x1$y, density_x3$y, density_x4$y, density_x5$y)
# Widen margins to leave room for the legend. c(bottom, left, top, right)
par(mar = c(5, 10, 4, 8))
plot(density_x1,
     main = "Density Plots of Input Signals",
     xlab = "Values",
     ylab = "Density",
     col = "#1130ee",   # blue: temperature (x1)
     ylim = c(0, max_y),
     xlim = c(0, 1200),
     lwd = 2)
lines(density_x3, col = "#47b86d", lwd = 2)  # green: x3
lines(density_x4, col = "#996668", lwd = 2)  # brown: x4
lines(density_x5, col = "#cedd22", lwd = 2)  # yellow: x5
par(xpd = TRUE)  # allow the legend to sit outside the plot region
# BUG FIX: the legend's third color was "#c33c63", which did not match the
# "#996668" line actually drawn for x4 above. The misleading color-name
# comments (Red/Teal/Brown/Slateblue) were also corrected.
legend("topright",
       legend = c("Temperature (x1)", "Pressure (x3)", "Humidity (x4)", "Vacuum (x5)"),
       col = c("#1130ee", "#47b86d", "#996668", "#cedd22"),
       lwd = 2)
par(xpd = FALSE)  # reset clipping to default
# Pooled histogram of every input value, then the same histogram with the
# pooled density curve and a jittered data rug overlaid.
hist(X, freq = FALSE, main = "Density plot of Input Signals", xlab = "Distribution", col = "#47b86d")
density_of_X <- density(X)
hist(X, freq = FALSE, main = "Density plot with Density line plot of Input signals", xlab = "Distribution", col = "#47b86d")
lines(density_of_X, lwd = 2, col = "#1130ee")
rug(jitter(X))
#histogram and density plot of individual input signal X and output signal y
# Helper: histogram with an overlaid density curve and a jittered rug of the
# raw data points along the x-axis, for one signal.
draw_hist_density <- function(values, dens, title, x_label) {
  hist(values, freq = FALSE, main = title, xlab = x_label, col = "#47b86d")
  lines(dens, lwd = 2, col = "#1130ee")
  rug(jitter(values))
}
# Temperature (x1)
density_of_X1 <- density(X[, "x1"])
draw_hist_density(X[, "x1"], density_of_X1, "Histogram and density plot of Temperature", "°C")
# Pressure (x3)
density_of_X3 <- density(X[, "x3"])
draw_hist_density(X[, "x3"], density_of_X3, "Histogram and density plot of Pressure", "millibar")
# Humidity (x4)
density_of_X4 <- density(X[, "x4"])
draw_hist_density(X[, "x4"], density_of_X4, "Histogram and density plot of Humidity", "%")
# Vacuum (x5)
density_of_X5 <- density(X[, "x5"])
draw_hist_density(X[, "x5"], density_of_X5, "Histogram and density plot of Vacuum", "cm Hg")
# Output signal y
density_of_y <- density(Y[, "y"])
draw_hist_density(Y[, "y"], density_of_y, "Histogram and density plot of Energy Output", "MegaWatt")
# 2x2 grid: output y against each input signal, then normality QQ plots.
par(mfrow = c(2, 2))
# Scatter plot of each input column against the output.
for (v in c("x1", "x3", "x4", "x5")) {
  plot(X[, v], Y,
       main = paste0("Correlation betweeen ", toupper(v), " and Y signal"),
       xlab = paste0(toupper(v), " signal"),
       ylab = "Output signal y",
       col = "#47b86d")
}
# Q-Q plots to check each input signal for normality.
qq_titles <- c(x1 = "Temperature", x3 = "Pressure", x4 = "Humidity", x5 = "Vacuum")
for (v in names(qq_titles)) {
  qqnorm(X[, v], col = "#47b86d", main = paste("QQ plot of", qq_titles[[v]]))
  qqline(X[, v], col = "#1130ee")
}
# Column of ones for the intercept (bias) term of every design matrix.
# FIX: use nrow(X) directly instead of the fragile length(X)/4, which would
# silently break if the number of input columns ever changed.
ones <- matrix(1, nrow(X), 1)
head(ones)
## [,1]
## [1,] 1
## [2,] 1
## [3,] 1
## [4,] 1
## [5,] 1
## [6,] 1
# Model 1: y = theta_bias + theta1*x4 + theta2*x3^2
# Assemble the design matrix: standardized predictors plus an intercept.
X_model1_raw <- cbind(X[, "x4"], X[, "x3"]^2)
X_model1_scaled <- scale(X_model1_raw)  # scale the predictors, not the intercept
X_model1 <- cbind(ones, X_model1_scaled)
head(X_model1)
# Least-squares estimate theta-hat = (X'X)^(-1) X'y; crossprod(A, B)
# computes t(A) %*% B without materializing the transpose.
Model1_thetahat <- solve(crossprod(X_model1), crossprod(X_model1, Y))
Model1_thetahat
## y
## [1,] 454.365009
## [2,] 3.348278
## [3,] -13.211452
# Model 2: y = theta_bias + theta1*x4 + theta2*x3^2 + theta3*x5
X_model2_raw <- cbind(X[, "x4"], X[, "x3"]^2, X[, "x5"])
X_model2_scaled <- scale(X_model2_raw)  # scale the predictors, not the intercept
X_model2 <- cbind(ones, X_model2_scaled)
head(X_model2)
# OLS estimate via the normal equations.
Model2_thetahat <- solve(crossprod(X_model2), crossprod(X_model2, Y))
Model2_thetahat
## y
## [1,] 454.365009
## [2,] 3.432657
## [3,] -12.406622
## [4,] 2.517326
# Model 3: y = theta_bias + theta1*x3 + theta2*x4 + theta3*x5^3
X_model3_raw <- cbind(X[, "x3"], X[, "x4"], X[, "x5"]^3)
X_model3_scaled <- scale(X_model3_raw)  # scale the predictors, not the intercept
X_model3 <- cbind(ones, X_model3_scaled)
head(X_model3)
# OLS estimate via the normal equations.
Model3_thetahat <- solve(crossprod(X_model3), crossprod(X_model3, Y))
Model3_thetahat
## y
## [1,] 454.365009
## [2,] -12.722943
## [3,] 3.402683
## [4,] 2.315840
# Model 4: y = theta_bias + theta1*x4 + theta2*x3^2 + theta3*x5^3
X_model4_raw <- cbind(X[, "x4"], X[, "x3"]^2, X[, "x5"]^3)
X_model4_scaled <- scale(X_model4_raw)  # scale the predictors, not the intercept
X_model4 <- cbind(ones, X_model4_scaled)
head(X_model4)
# OLS estimate via the normal equations.
Model4_thetahat <- solve(crossprod(X_model4), crossprod(X_model4, Y))
Model4_thetahat
## y
## [1,] 454.365009
## [2,] 3.487220
## [3,] -12.402038
## [4,] 2.487015
# Model 5: y = theta_bias + theta1*x4 + theta2*x1^2 + theta3*x3^2
X_model5_raw <- cbind(X[, "x4"], X[, "x1"]^2, X[, "x3"]^2)
X_model5_scaled <- scale(X_model5_raw)  # scale the predictors, not the intercept
X_model5 <- cbind(ones, X_model5_scaled)
head(X_model5)
# OLS estimate via the normal equations.
Model5_thetahat <- solve(crossprod(X_model5), crossprod(X_model5, Y))
Model5_thetahat
## y
## [1,] 454.365009
## [2,] 1.349107
## [3,] -10.605331
## [4,] -5.173124
# Display the estimated parameter vectors of all five models: first the
# column vector returned by the OLS solve, then its transpose so the
# parameters read left-to-right. These are bare expressions relying on
# top-level auto-printing (knitr), hence no explicit print() calls.
#model1
Model1_thetahat
## y
## [1,] 454.365009
## [2,] 3.348278
## [3,] -13.211452
t(Model1_thetahat)
## [,1] [,2] [,3]
## y 454.365 3.348278 -13.21145
#model 2
Model2_thetahat
## y
## [1,] 454.365009
## [2,] 3.432657
## [3,] -12.406622
## [4,] 2.517326
t(Model2_thetahat)
## [,1] [,2] [,3] [,4]
## y 454.365 3.432657 -12.40662 2.517326
#model 3
Model3_thetahat
## y
## [1,] 454.365009
## [2,] -12.722943
## [3,] 3.402683
## [4,] 2.315840
t(Model3_thetahat)
## [,1] [,2] [,3] [,4]
## y 454.365 -12.72294 3.402683 2.31584
#model 4
Model4_thetahat
## y
## [1,] 454.365009
## [2,] 3.487220
## [3,] -12.402038
## [4,] 2.487015
t(Model4_thetahat)
## [,1] [,2] [,3] [,4]
## y 454.365 3.48722 -12.40204 2.487015
#model 5
Model5_thetahat
## y
## [1,] 454.365009
## [2,] 1.349107
## [3,] -10.605331
## [4,] -5.173124
t(Model5_thetahat)
## [,1] [,2] [,3] [,4]
## y 454.365 1.349107 -10.60533 -5.173124
# Task 2.2: fitted outputs (y-hat) and residual sum of squares per model.
# Model 1
Y_hat_model1 <- X_model1 %*% Model1_thetahat
head(Y_hat_model1)
resid1 <- Y - Y_hat_model1
RSS_Model_1 <- sum(resid1^2)
head(RSS_Model_1)
## [1] 657248.2
# Model 2
Y_hat_model2 <- X_model2 %*% Model2_thetahat
head(Y_hat_model2)
resid2 <- Y - Y_hat_model2
RSS_Model_2 <- sum(resid2^2)
head(RSS_Model_2)
## [1] 602347.1
# Model 3
Y_hat_model3 <- X_model3 %*% Model3_thetahat
head(Y_hat_model3)
resid3 <- Y - Y_hat_model3
RSS_Model_3 <- sum(resid3^2)
head(RSS_Model_3)
## [1] 547491.6
# Model 4
Y_hat_model4 <- X_model4 %*% Model4_thetahat
head(Y_hat_model4)
resid4 <- Y - Y_hat_model4
RSS_Model_4 <- sum(resid4^2)
head(RSS_Model_4)
## [1] 603630.7
# Model 5
Y_hat_model5 <- X_model5 %*% Model5_thetahat
head(Y_hat_model5)
resid5 <- Y - Y_hat_model5
RSS_Model_5 <- sum(resid5^2)
head(RSS_Model_5)
## [1] 365625
# One-row comparison table of the five RSS values.
dfRSS <- data.frame(
  model1 = RSS_Model_1,
  model2 = RSS_Model_2,
  model3 = RSS_Model_3,
  model4 = RSS_Model_4,
  model5 = RSS_Model_5
)
dfRSS
## model1 model2 model3 model4 model5
## 1 657248.2 602347.1 547491.6 603630.7 365625
#Task 2.3 Calculating the residual variance and Gaussian log-likelihood of
# each model. ln L = -(N/2)ln(2*pi) - (N/2)ln(sigma^2) - RSS/(2*sigma^2).
# NOTE(review): the variance divides by N-1 rather than N-k (k = number of
# estimated parameters); confirm this matches the assignment's definition.
# (The variable name "likehood" [sic] is kept — renaming would break the
# references below.)
N=length(Y)
#Calculating the Variance of Model 1
Variance_model1=RSS_Model_1/(N-1)
Variance_model1
## [1] 68.69951
#Calculating the log-likelihood of Model 1
likehood_Model_1=
-(N/2)*(log(2*pi))-(N/2)*(log(Variance_model1))-(1/(2*Variance_model1))*RSS_Model_1
likehood_Model_1
## [1] -33810.99
#Calculating Variance and log-likelihood of Model 2
Variance_model2=RSS_Model_2/(N-1)
Variance_model2
## [1] 62.96091
likehood_Model_2=
-(N/2)*(log(2*pi))-(N/2)*(log(Variance_model2))-(1/(2*Variance_model2))*RSS_Model_2
likehood_Model_2
## [1] -33393.69
#Calculating Variance and log-likelihood of Model 3
Variance_model3=RSS_Model_3/(N-1)
Variance_model3
## [1] 57.2271
likehood_Model_3=
-(N/2)*(log(2*pi))-(N/2)*(log(Variance_model3))-(1/(2*Variance_model3))*RSS_Model_3
likehood_Model_3
## [1] -32936.88
#Calculating Variance and log-likelihood of Model 4
Variance_model4=RSS_Model_4/(N-1)
Variance_model4
## [1] 63.09509
likehood_Model_4=
-(N/2)*(log(2*pi))-(N/2)*(log(Variance_model4))-(1/(2*Variance_model4))*RSS_Model_4
likehood_Model_4
## [1] -33403.88
#Calculating Variance and log-likelihood of Model 5
Variance_model5=RSS_Model_5/(N-1)
Variance_model5
## [1] 38.21731
likehood_Model_5=
-(N/2)*(log(2*pi))-(N/2)*(log(Variance_model5))-(1/(2*Variance_model5))*RSS_Model_5
likehood_Model_5
## [1] -31005.4
# Collect the per-model variances into a one-row comparison table.
#printing variance values
model1 <- c(Variance_model1)
model2 <- c(Variance_model2)
model3 <- c(Variance_model3)
model4 <- c(Variance_model4)
model5 <- c(Variance_model5)
dfVariance <- data.frame(model1, model2,model3,model4,model5)
dfVariance
## model1 model2 model3 model4 model5
## 1 68.69951 62.96091 57.2271 63.09509 38.21731
# Collect the per-model log-likelihoods into a one-row comparison table.
#printing likelihood values
model1 <- c(likehood_Model_1)
model2 <- c(likehood_Model_2)
model3 <- c(likehood_Model_3)
model4 <- c(likehood_Model_4)
model5 <- c(likehood_Model_5)
dfLikelihood <- data.frame(model1, model2,model3,model4,model5)
dfLikelihood
## model1 model2 model3 model4 model5
## 1 -33810.99 -33393.69 -32936.88 -33403.88 -31005.4
# Task: model selection criteria. For each model, k = number of estimated
# parameters (length of theta-hat), AIC = 2k - 2 ln L, BIC = k ln(N) - 2 ln L.
# Lower values indicate a better likelihood/complexity trade-off.
# Calculating AIC and BIC of model 1
K_model1<-length(Model1_thetahat)
K_model1
## [1] 3
AIC_model1=2*K_model1-2*likehood_Model_1
AIC_model1
## [1] 67627.98
BIC_model1=K_model1*log(N)-2*likehood_Model_1
BIC_model1
## [1] 67649.48
# Parameter count (k) of model 2
K_model2<-length(Model2_thetahat)
K_model2
## [1] 4
##Calculating AIC and BIC of model 2
AIC_model2=2*K_model2-2*likehood_Model_2
AIC_model2
## [1] 66795.38
BIC_model2=K_model2*log(N)-2*likehood_Model_2
BIC_model2
## [1] 66824.05
# Parameter count (k) of model 3
K_model3<-length(Model3_thetahat)
K_model3
## [1] 4
##Calculating AIC and BIC of model 3
AIC_model3=2*K_model3-2*likehood_Model_3
AIC_model3
## [1] 65881.77
BIC_model3=K_model3*log(N)-2*likehood_Model_3
BIC_model3
## [1] 65910.43
# Parameter count (k) of model 4
K_model4<-length(Model4_thetahat)
K_model4
## [1] 4
##Calculating AIC and BIC of model 4
AIC_model4=2*K_model4-2*likehood_Model_4
AIC_model4
## [1] 66815.75
BIC_model4=K_model4*log(N)-2*likehood_Model_4
BIC_model4
## [1] 66844.42
# Parameter count (k) of model 5
K_model5<-length(Model5_thetahat)
K_model5
## [1] 4
##Calculating AIC and BIC of model 5
AIC_model5=2*K_model5-2*likehood_Model_5
AIC_model5
## [1] 62018.79
BIC_model5=K_model5*log(N)-2*likehood_Model_5
BIC_model5
## [1] 62047.46
# One-row comparison tables for k, AIC, and BIC.
#printing K values
model1 <- c(K_model1)
model2 <- c(K_model2)
model3 <- c(K_model3)
model4 <- c(K_model4)
model5 <- c(K_model5)
dfK <- data.frame(model1, model2,model3,model4,model5)
dfK
## model1 model2 model3 model4 model5
## 1 3 4 4 4 4
#printing AIC values
model1 <- c(AIC_model1)
model2 <- c(AIC_model2)
model3 <- c(AIC_model3)
model4 <- c(AIC_model4)
model5 <- c(AIC_model5)
dfAIC <- data.frame(model1, model2,model3,model4,model5)
dfAIC
## model1 model2 model3 model4 model5
## 1 67627.98 66795.38 65881.77 66815.75 62018.79
#printing BIC values
model1 <- c(BIC_model1)
model2 <- c(BIC_model2)
model3 <- c(BIC_model3)
model4 <- c(BIC_model4)
model5 <- c(BIC_model5)
dfBIC <- data.frame(model1, model2,model3,model4,model5)
dfBIC
## model1 model2 model3 model4 model5
## 1 67649.48 66824.05 65910.43 66844.42 62047.46
# Residual diagnostics: one full-size QQ plot per model. If a model's errors
# are Gaussian, the points should follow the reference line.
par(mfrow=c(1,1))
## Error (residuals) of model 1
model1_error <- Y-Y_hat_model1
head(model1_error)
## y
## [1,] 13.9749910
## [2,] -4.6721095
## [3,] -10.9765113
## [4,] -3.8510601
## [5,] -5.7048141
## [6,] 0.3178101
## Plotting the graph QQplot and QQ line of model 1
qqnorm(model1_error, col = "#47b86d",main = "QQ plot of Model 1 (y = θ1x4 + θ2x3² + θbias)")
qqline(model1_error, col = "#1130ee",lwd=1)
## Error (residuals) of model 2
model2_error <- Y-Y_hat_model2 # error of model 2
## Plotting QQplot and QQ line of model 2
qqnorm(model2_error, col = "#47b86d",main = "QQ plot of Model 2 (y = θ1x4 + θ2x3² + θ3x5 + θbias)")
qqline(model2_error, col = "#1130ee")
## Error (residuals) of model 3
model3_error <- Y- Y_hat_model3
## Plotting QQplot and QQ line of model 3
qqnorm(model3_error, col = "#47b86d",main = "QQ plot of Model 3 (y = θ1x3 + θ2x4 + θ3x5 + θbias)")
qqline(model3_error, col = "#1130ee")
## Error (residuals) of model 4
model4_error <- Y-Y_hat_model4
## Plotting QQplot and QQ line of model 4
qqnorm(model4_error, col = "#47b86d",main = "QQ plot of Model 4 (y = θ1x4 + θ2x3² + θ3x5³ + θbias)")
qqline(model4_error, col = "#1130ee")
## Error (residuals) of model 5
model5_error <- Y- Y_hat_model5
## Plotting QQplot and QQ line of model 5
qqnorm(model5_error, col = "#47b86d",main = "QQ plot of Model 5 (y = θ1x4 + θ2x1² + θ3x3³ + θbias)")
qqline(model5_error, col = "#1130ee")
# Compute absolute z-scores, |x - mean(x)| / sd(x). Matrix or data.frame
# input is processed column by column; a plain vector is handled directly.
calculate_z_scores <- function(data) {
  abs_z <- function(v) abs((v - mean(v)) / sd(v))
  if (is.matrix(data) || is.data.frame(data)) {
    apply(data, 2, abs_z)
  } else {
    abs_z(data)
  }
}
# Outlier detection: flag any row where at least one input column lies more
# than 3 standard deviations from its column mean.
# Calculate Z-scores for input variables X
z_scores_X <- calculate_z_scores(X)
# Identify rows where any X variable has a Z-score > 3.0
outlier_rows_X <- which(apply(z_scores_X, 1, function(row) any(row > 3.0)))
# Print summary of outliers in predictors only
cat("Number of outliers in input variables:", length(outlier_rows_X), "\n")
## Number of outliers in input variables: 58
cat("Percentage of data with outliers:", round(100 * length(outlier_rows_X) / nrow(X), 2), "%\n")
## Percentage of data with outliers: 0.61 %
# Remove outliers from both X and Y (maintaining correspondence)
# NOTE(review): X_clean and Y_clean are created here but never used again —
# the train/test split below operates on the raw X and Y. Confirm whether
# the cleaned data was meant to feed the split.
X_clean <- X[-outlier_rows_X, ]
Y_clean <- Y[-outlier_rows_X, ]
# Verify dimensions match
cat("\nDimensions after outlier removal:\n")
##
## Dimensions after outlier removal:
cat("X dimensions:", dim(X_clean), "\n")
## X dimensions: 9510 4
cat("Y dimensions:", length(Y_clean), "\n")
## Y dimensions: 9510
cat("Remaining data points:", nrow(X_clean), "\n")
## Remaining data points: 9510
# Optional: Print statistical summary of cleaned data
cat("\nSummary of cleaned input variables:\n")
##
## Summary of cleaned input variables:
print(colMeans(X_clean))
## x1 x3 x4 x5
## 19.69280 54.36676 1013.19211 73.32460
cat("\nSummary of cleaned target variable:\n")
##
## Summary of cleaned target variable:
cat("Mean:", mean(Y_clean), "\n")
## Mean: 454.2696
cat("SD:", sd(Y_clean), "\n")
## SD: 17.03703
## Splitting the dataset into training and testing sets (70/30).
# BUG FIX: the original called initial_split() separately on Y and on X,
# drawing two *independent* random row samples — the i-th training row of X
# no longer corresponded to the i-th training row of Y, destroying the
# input/output pairing (consistent with the near-zero test R^2 reported
# later). Split ONCE on the combined data so rows stay aligned.
combined_data <- as.data.frame(cbind(X, Y))  # columns x1, x3, x4, x5, y
split_data <- initial_split(data = combined_data, prop = .7)
train_df <- training(split_data)
test_df <- testing(split_data)
# Re-create the same downstream variable names/types as before.
Y_training_set <- train_df[, "y", drop = FALSE]
Y_training_data <- as.matrix(Y_training_set)
Y_testing_set <- as.matrix(test_df[, "y", drop = FALSE])
X_training_set <- train_df[, c("x1", "x3", "x4", "x5")]
X_training_data <- as.matrix(X_training_set)
X_testing_set <- as.matrix(test_df[, c("x1", "x3", "x4", "x5")])
X_testing_data <- X_testing_set
# Build the model-5 design matrix from the TRAINING split only.
X_train_model_raw <- cbind(
  x4 = X_training_set[, "x4"],
  x1_sq = X_training_set[, "x1"]^2,
  x3_sq = X_training_set[, "x3"]^2
)
# Standardize and remember the training centering/scaling statistics so the
# exact same transform can be applied to the test split (no test leakage).
X_train_scaled <- scale(X_train_model_raw)
center_vals <- attr(X_train_scaled, "scaled:center")
scale_vals <- attr(X_train_scaled, "scaled:scale")
# Prepend the intercept column to form the final training design matrix.
X_traning_model <- cbind(matrix(1, nrow(X_train_scaled), 1), X_train_scaled)
# OLS estimate of theta from the training data.
traning_thetahat <- solve(t(X_traning_model) %*% X_traning_model) %*%
  t(X_traning_model) %*% Y_training_data
# === Apply the identical transform to the test split === #
X_test_model_raw <- cbind(
  x4 = X_testing_set[, "x4"],
  x1_sq = X_testing_set[, "x1"]^2,
  x3_sq = X_testing_set[, "x3"]^2
)
X_test_scaled <- scale(X_test_model_raw, center = center_vals, scale = scale_vals)
# Final test design matrix with intercept.
X_test_model <- cbind(matrix(1, nrow(X_test_scaled), 1), X_test_scaled)
### Prediction on the held-out test set.
# FIX: traning_thetahat was already estimated from the training data above;
# the original re-ran the identical OLS solve here (alongside a block of
# commented-out dead code), so the redundant computation was removed.
Y_testing_hat = X_test_model %*% traning_thetahat
head(Y_testing_hat)
## y
## [1,] 453.8734
## [2,] 454.4426
## [3,] 454.5583
## [4,] 454.4864
## [5,] 454.1396
## [6,] 454.4734
# Residual sum of squares on the test set.
RSS_testing <- sum((Y_testing_set-Y_testing_hat)^2)
head(RSS_testing)
## [1] 841630.7
# One-sample t-test of the pooled training design-matrix values against
# mu = 500.
# NOTE(review): X_traning_model mixes the intercept column (all ones) with
# standardized predictors (mean ~0), so its pooled mean is ~0.25 by
# construction; testing it against mu = 500 is hard to interpret — confirm
# the intended quantity.
t.test(X_traning_model, mu=500, alternative="two.sided", conf.level=0.95)
##
## One Sample t-test
##
## data: X_traning_model
## t = -84480, df = 26787, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
## 0.2384051 0.2615949
## sample estimates:
## mean of x
## 0.25
# NOTE(review): these hard-coded bounds do not match the 95% CI printed by
# the t-test above (0.2384, 0.2616) — verify where they came from.
C_I1=-0.2783950
C_I2=0.2762101
# Density of the pooled training design matrix with the two bounds overlaid.
p2 <- plot(density(X_traning_model), col="#1130ee", lwd=2,
main="Distribution of Traning Data")
abline(v=C_I1,col="#996668", lty=2)
abline(v=C_I2,col="#996668", lty=2)
# OLS fit on the RAW four training inputs without an intercept.
# NOTE(review): this is a different model from model 5 above (no squared
# terms, no bias column) — confirm it is intentional and not leftover code.
thetaHat_training =solve(t(X_training_data) %*% X_training_data) %*% t(X_training_data) %*%Y_training_data
thetaHat_training
## y
## x1 0.37277590
## x3 -0.05581042
## x4 0.43921267
## x5 0.06901410
length(thetaHat_training)
## [1] 4
# Density of the training target values.
dis_test=density(Y_training_data)
plot((dis_test), main = "Density plot of Output Training Data", col = "#1130ee")
# === 95% prediction intervals for the test-set predictions === #
# 1. Residual variance estimated from the training fit.
Y_training_hat <- X_traning_model %*% traning_thetahat
residuals <- Y_training_data - Y_training_hat
RSS_train <- sum(residuals^2)
n_train <- nrow(X_traning_model)
p <- ncol(X_traning_model)                 # parameter count, intercept included
sigma_squared <- RSS_train / (n_train - p) # unbiased residual variance
# 2. Covariance of the parameter estimates: sigma^2 * (X'X)^(-1).
param_cov_matrix <- sigma_squared * solve(t(X_traning_model) %*% X_traning_model)
# 3. Prediction interval at every test point. The prediction variance is
#    x_i' Cov x_i (parameter uncertainty) + sigma^2 (irreducible noise);
#    the quadratic form is evaluated for all rows at once via
#    rowSums((X C) * X) instead of a per-row loop.
n_test <- nrow(X_test_model)
t_crit <- qt(0.975, df = n_train - p)      # critical t-value for 95% CI
pred_var_params <- rowSums((X_test_model %*% param_cov_matrix) * X_test_model)
pred_variance <- sigma_squared + pred_var_params
margin <- t_crit * sqrt(pred_variance)
lower_CI <- as.numeric(Y_testing_hat - margin)
upper_CI <- as.numeric(Y_testing_hat + margin)
# === Visualization: test-set actuals, predictions, and 95% CI band === #
plot_data <- data.frame(
  Index = 1:n_test,
  Actual = Y_testing_set,
  Predicted = Y_testing_hat,
  Lower_CI = lower_CI,
  Upper_CI = upper_CI
)
# Performance metrics on the held-out test set.
MSE <- mean((Y_testing_set - Y_testing_hat)^2)
RMSE <- sqrt(MSE)
R_squared <- 1 - sum((Y_testing_set - Y_testing_hat)^2) / sum((Y_testing_set - mean(Y_testing_set))^2)
cat("Model 5 Performance Metrics:\n")
cat("MSE:", MSE, "\n")
cat("RMSE:", RMSE, "\n")
cat("R-squared:", R_squared, "\n")
par(mar = c(5, 5, 4, 2))  # extra left margin for the y-axis label
# Sort by actual value for a readable display.
sorted_indices <- order(Y_testing_set)
# BUG FIX: the original plotted the SORTED y-values against their ORIGINAL
# test indices; those x-coordinates are non-monotonic, so lines() and
# polygon() drew a scribble across the panel. Plot sorted values against
# their rank 1..n_test instead.
rank_x <- seq_len(n_test)
actual_sorted <- Y_testing_set[sorted_indices]
pred_sorted <- Y_testing_hat[sorted_indices]
lower_sorted <- lower_CI[sorted_indices]
upper_sorted <- upper_CI[sorted_indices]
plot(rank_x, actual_sorted,
     type = "p",
     pch = 16,
     col = "#47b86d",
     xlab = "Test Sample Index",
     ylab = "Energy Output (MW)",
     main = "Model 5: Predictions with 95% Confidence Intervals",
     ylim = range(c(lower_CI, upper_CI, Y_testing_set)))
# Shaded CI band first so the points and line stay visible on top.
polygon(c(rank_x, rev(rank_x)),
        c(lower_sorted, rev(upper_sorted)),
        col = rgb(0.7, 0.7, 0.7, 0.3),
        border = NA)
# Prediction line.
lines(rank_x, pred_sorted, col = "#1130ee", lwd = 2)
# Error bars on every 10th point to avoid overcrowding.
for (i in seq(1, n_test, 10)) {
  segments(rank_x[i], lower_sorted[i],
           rank_x[i], upper_sorted[i],
           col = "gray40")
}
legend("topleft",
       legend = c("Actual Values", "Predicted Values", "95% Confidence Interval"),
       col = c("#47b86d", "#1130ee", "gray70"),
       pch = c(16, NA, 15),
       lty = c(NA, 1, NA),
       pt.cex = c(1, 0, 2),
       bg = "white")
# Annotate the panel with the headline metrics.
mtext(paste("RMSE =", round(RMSE, 2),
            " R² =", round(R_squared, 3)),
      side = 3,
      line = 0.5,
      cex = 0.8)
#Task 3: rejection ABC over the two largest-magnitude model-5 parameters
# (theta-bias and theta-one); the remaining two are held at their OLS values.
Model5_thetahat
## y
## [1,] 454.365009
## [2,] 1.349107
## [3,] -10.605331
## [4,] -5.173124
# Point estimates taken from theta-hat above.
thetebias <- 454.365009  # parameter explored by ABC
thetaone <- 1.349107     # parameter explored by ABC
thetatwo <- -10.605331   # held constant
thetathree <- -5.173124  # held constant
Epison <- RSS_Model_5 * 2  # acceptance threshold (epsilon)
# FIX: with a correct acceptance rule only ~1% of uniform draws fall inside
# epsilon, so the draw count is raised from 100 to give a usable posterior
# sample; accepted values are collected compactly (the original indexed
# assignment left NA gaps for every rejected draw).
num <- 10000
accepted_bias <- numeric(num)
accepted_one <- numeric(num)
counter <- 0
for (i in seq_len(num)) {
  range1 <- runif(1, -454.365009, 454.365009)  # uniform prior for theta-bias
  range2 <- runif(1, -1.349107, 1.349107)      # uniform prior for theta-one
  New_thetahat <- matrix(c(range1, range2, thetatwo, thetathree))
  New_Y_Hat <- X_model5 %*% New_thetahat  # prediction under the candidate
  new_RSS <- sum((Y - New_Y_Hat)^2)       # distance measure
  # BUG FIX: rejection ABC ACCEPTS a draw when its distance is BELOW the
  # threshold; the original kept draws with new_RSS > Epison, i.e. it
  # retained exactly the rejected region.
  if (new_RSS < Epison) {
    counter <- counter + 1
    accepted_bias[counter] <- range1
    accepted_one[counter] <- range2
  }
}
if (counter > 0) {
  f_value <- matrix(accepted_bias[seq_len(counter)])
  s_value <- matrix(accepted_one[seq_len(counter)])
  # Marginal posteriors of the two sampled parameters.
  hist(f_value, col = "#47b86d")
  hist(s_value, col = "#47b86d")
  ###plotting Joint and Marginal Posterior Distribution
  plot(f_value, s_value, col = c("#47b86d", "#1130ee"), main = "Joint and Marginal Posterior Distribution (Rejection Range)")
} else {
  message("No ABC draws accepted; consider increasing 'num' or Epison.")
}
par(mfrow = c(1, 1))